library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Locate every run CSV in the data directory. The pattern is anchored with `$`
# so only names that actually end in ".csv" are picked up.
path_to_csvs <- "~/Google_Drive/Simplification-for-generalization/simplification-data"
all_csvs <- list.files(path = path_to_csvs, pattern = "\\.csv$")

# Read a single CSV from `path_to_csvs`, trim whitespace from its column names,
# and add a `problem` column derived from the file name.
read_run_csv <- function(filename) {
  tmp_df <- read.csv(file.path(path_to_csvs, filename), stringsAsFactors = FALSE)
  colnames(tmp_df) <- str_trim(colnames(tmp_df))

  # Split the file name on '.' and '-' and keep tokens 3 .. (n - 1): the first
  # two tokens and the final token (the extension) are dropped. This is what
  # the earlier `s[4:length(s)-1]` evaluated to, since `:` binds tighter than
  # `-`; it is written out explicitly here.
  s <- unlist(str_split(filename, "[.-]"))
  s <- s[3:(length(s) - 1)]

  tmp_df$problem <- str_c(s, collapse = "-")
  tmp_df
}

# Load all CSVs and stack them into a single data frame in one step rather
# than growing a data frame inside a loop.
all_runs <- bind_rows(lapply(all_csvs, read_run_csv))

Nic's plot

# One row per unsimplified record: its program size (as originalSize) and
# whether its test error was exactly zero (genPre).
unsimplified_info <- all_runs %>%
  filter(type == "unsimplified") %>%
  transmute(problem,
            log,
            method,
            originalSize = programSize,
            genPre = testError == 0)

# Pair every simplified record with its unsimplified counterpart (matched on
# problem, log and method) and scatter simplified size against original size.
# `quad` is the "<genPre>_<genPost>" combination and drives the facet rows;
# facet columns are the simplification methods.
all_runs %>%
  filter(type == "simplified") %>%
  mutate(simplifiedSize = programSize,
         genPost = testError == 0) %>%
  inner_join(unsimplified_info, by = c("problem", "log", "method")) %>%
  unite(quad, genPre, genPost, sep = "_") %>%
  select(problem, method, originalSize, simplifiedSize, quad) %>%
  ggplot(mapping = aes(x = originalSize, y = simplifiedSize)) +
  geom_point(mapping = aes(colour = problem),
             shape = 4,
             size = 0.3,
             alpha = 0.3) +
  # grey y = x reference line
  geom_abline(intercept = 0, slope = 1, colour = 'grey') +
  # dashed black linear fit per facet
  geom_smooth(method = "lm",
              se = FALSE,
              linetype = 2,
              size = 0.5,
              colour = 'black') +
  facet_grid(quad ~ method) +
  theme_bw() +
  # draw legend keys fully opaque even though the points are translucent
  guides(colour = guide_legend(override.aes = list(alpha = 1)))

Nic's plot rotated

# Size and generalization flag (test error exactly zero) of each unsimplified
# record, keyed by problem, log and method.
unsimplified_info <- all_runs %>%
  filter(type == "unsimplified") %>%
  mutate(originalSize = programSize,
         genPre = testError == 0) %>%
  select(problem, log, method, originalSize, genPre)

# Same scatter as the previous plot with the roles of problem and method
# swapped: points are coloured by method and facet columns are problems, with
# free axis scales per facet.
all_runs %>%
  filter(type == "simplified") %>%
  mutate(simplifiedSize = programSize,
         genPost = testError == 0) %>%
  inner_join(unsimplified_info, by = c("problem", "log", "method")) %>%
  unite(quad, genPre, genPost, sep = "_") %>%
  select(problem, method, originalSize, simplifiedSize, quad) %>%
  ggplot(mapping = aes(x = originalSize, y = simplifiedSize)) +
  geom_point(mapping = aes(colour = method),
             shape = 4,
             size = 0.5,
             alpha = 0.3) +
  # grey y = x reference line
  geom_abline(intercept = 0, slope = 1, colour = 'grey') +
  # dashed black linear fit per facet
  geom_smooth(method = "lm",
              se = FALSE,
              linetype = 2,
              size = 0.5,
              colour = 'black') +
  facet_grid(quad ~ problem, scales = "free") +
  theme_bw() +
  # tilt the x tick labels slightly
  theme(axis.text.x = element_text(angle = 15, hjust = 1)) +
  # draw legend keys fully opaque even though the points are translucent
  guides(colour = guide_legend(override.aes = list(alpha = 1)))

Problem vs Average Program Size by Method

Table

# Mean simplified program size (rounded to 3 decimals) for each
# (method, problem) pair, spread so that every method is its own column.
simp_runs <- all_runs %>%
  filter(type == "simplified") %>%
  group_by(method, problem) %>%
  summarise(avgProgramSize = round(mean(programSize), digits = 3)) %>%
  spread(key = method, value = avgProgramSize)

# Mean unsimplified program size per problem (column `none`), joined with the
# per-method averages above; the combined table is also written to disk.
prog_size_table <- all_runs %>%
  filter(type == "unsimplified") %>%
  group_by(problem) %>%
  summarise(none = round(mean(programSize), digits = 3)) %>%
  inner_join(simp_runs, by = "problem")
write.csv(prog_size_table, file = "prog_size_table.csv")

Plot

# Long-format copy of the size table: one row per (problem, variable) pair,
# where `variable` is "none" (unsimplified) or a simplification method and
# `size` is the average program size.
size_long <- melt(prog_size_table,
                  id.vars = c("problem"),
                  measure.vars = c("none", "Genome", "GenomeBacktracking", "GenomeBacktrackingNoop", "GenomeNoop", "Program"),
                  value.name = "size")

# Horizontal marker segments showing the unsimplified ("none") size of each
# problem. A discrete x axis places its sorted values at 1, 2, ..., so each
# problem's position is its index among the sorted problem names; the segment
# extends 0.45 to either side of it. Deriving the positions this way (instead
# of hard-coding 24 positions) works for any number of problems and does not
# depend on the row order of the table.
horiz_lines_df <- size_long %>%
  filter(variable == "none") %>%
  mutate(x1 = as.integer(factor(problem)) - 0.45,
         x2 = as.integer(factor(problem)) + 0.45)

# Dodged bars of average simplified size per problem and method, with the
# unsimplified size drawn on top as a thick horizontal segment.
size_long %>%
  filter(variable != "none") %>%
  ggplot(aes(x = problem,
             y = size,
             fill = variable)) +
  geom_bar(stat = "identity",
           position = "dodge") +
  geom_segment(data = horiz_lines_df,
               aes(x = x1,
                   y = size,
                   xend = x2,
                   yend = size),
               size = 1.5) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 15, hjust = 1)) +
  labs(x = "Problem",
       y = "Program Size")

Problem Vs Percent Generalized by Method

Table

# Fraction of simplified records with zero test error for each
# (method, problem) pair, spread so that every method is its own column.
simp_runs <- all_runs %>%
  filter(type == "simplified") %>%
  group_by(method, problem) %>%
  summarise(percentGeneralized = mean(as.numeric(testError == 0))) %>%
  spread(key = method, value = percentGeneralized)

# Fraction of unsimplified records with zero test error per problem (column
# `none`), joined with the per-method fractions above and written to disk.
perct_generalized_table <- all_runs %>%
  filter(type == "unsimplified") %>%
  group_by(problem) %>%
  summarise(none = mean(as.numeric(testError == 0))) %>%
  inner_join(simp_runs, by = "problem")
write.csv(perct_generalized_table, file = "perct_generalized_table.csv")

Plot

# Long-format copy of the generalization table: one row per
# (problem, variable) pair, where `variable` is "none" (unsimplified) or a
# simplification method and `pct_gener` is the fraction that generalized.
gener_long <- melt(perct_generalized_table,
                   id.vars = c("problem"),
                   measure.vars = c("none", "Genome", "GenomeBacktracking", "GenomeBacktrackingNoop", "GenomeNoop", "Program"),
                   value.name = "pct_gener")

# Horizontal marker segments showing the unsimplified ("none") fraction of
# each problem. A discrete x axis places its sorted values at 1, 2, ..., so
# each problem's position is its index among the sorted problem names; the
# segment extends 0.45 to either side of it. Deriving the positions this way
# (instead of hard-coding 24 positions) works for any number of problems and
# does not depend on the row order of the table.
horiz_lines_df <- gener_long %>%
  filter(variable == "none") %>%
  mutate(x1 = as.integer(factor(problem)) - 0.45,
         x2 = as.integer(factor(problem)) + 0.45)

# Dodged bars of the generalized fraction per problem and method, with the
# unsimplified fraction drawn on top as a thick horizontal segment.
gener_long %>%
  filter(variable != "none") %>%
  ggplot(aes(x = problem,
             y = pct_gener,
             fill = variable)) +
  geom_bar(stat = "identity",
           position = "dodge") +
  geom_segment(data = horiz_lines_df,
               aes(x = x1,
                   y = pct_gener,
                   xend = x2,
                   yend = pct_gener),
               size = 1.5) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 15, hjust = 1)) +
  labs(x = "Problem",
       y = "% Generalized")

Simplified Size vs Percent Generalized

Plot a

# Distinct (log, problem) pairs whose unsimplified record has zero test error.
unsimplified_that_generalize <- all_runs %>%
  filter(type == "unsimplified" & testError == 0) %>%
  select(log, problem) %>%
  distinct()

# Restricting to the simplified records of those pairs, compute the fraction
# with zero test error for every (programSize, problem) combination and plot
# it against the simplified program size.
all_runs %>%
  filter(type == "simplified") %>%
  inner_join(unsimplified_that_generalize, by = c("log", "problem")) %>%
  group_by(programSize, problem) %>%
  summarise(percentGeneralized = mean(as.numeric(testError == 0))) %>%
  ggplot(mapping = aes(x = programSize, y = percentGeneralized)) +
  geom_point(alpha = 0.5, size = 1) +
  # per-problem faceting is currently switched off:
  # facet_wrap(~ problem) +
  theme_bw() +
  ggtitle("Unsimplified Programs that Generalized") +
  xlab("Post Simplification Program Size") +
  ylab("% Generalized")

Plot b

# Distinct (log, problem) pairs whose unsimplified record has a test error
# greater than zero.
unsimplified_that_dont_generalize <- all_runs %>%
  filter(type == "unsimplified" & testError > 0) %>%
  select(log, problem) %>%
  distinct()

# Restricting to the simplified records of those pairs, compute the fraction
# with zero test error for every (programSize, problem) combination and plot
# it against the simplified program size.
all_runs %>%
  filter(type == "simplified") %>%
  inner_join(unsimplified_that_dont_generalize, by = c("log", "problem")) %>%
  group_by(programSize, problem) %>%
  summarise(percentGeneralized = mean(as.numeric(testError == 0))) %>%
  ggplot(mapping = aes(x = programSize, y = percentGeneralized)) +
  geom_point(alpha = 0.5, size = 1) +
  # per-problem faceting is currently switched off:
  # facet_wrap(~ problem) +
  theme_bw() +
  ggtitle("Unsimplified Programs that Didn't Generalized") +
  xlab("Post Simplification Program Size") +
  ylab("% Generalized")

Density of simplified size

Not faceted

# Density of simplified program size over all simplified records, split by
# whether the record has zero test error (`generalized`).
all_runs %>%
  filter(type == "simplified") %>%
  mutate(generalized = (testError == 0)) %>%
  ggplot(mapping = aes(x = programSize,
                       fill = generalized,
                       colour = generalized)) +
  geom_density(alpha = 0.5) +
  # per-problem faceting is currently switched off:
  # facet_wrap(~ problem, scales = "free") +
  theme_bw() +
  xlab("Post Simplification Program Size")

Facet where unsimplified program generalized

# Density of simplified program size, one facet per problem with free scales,
# limited to (log, problem) pairs listed in `unsimplified_that_generalize`.
all_runs %>%
  filter(type == "simplified") %>%
  inner_join(unsimplified_that_generalize, by = c("log", "problem")) %>%
  mutate(generalized = (testError == 0)) %>%
  ggplot(mapping = aes(x = programSize,
                       fill = generalized,
                       colour = generalized)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~ problem, scales = "free") +
  theme_bw() +
  ggtitle("Unsimplified Programs that Generalized") +
  xlab("Post Simplification Program Size")

Facet where unsimplified program DIDN’T generalize

# Density of simplified program size, one facet per problem with free scales,
# limited to (log, problem) pairs listed in
# `unsimplified_that_dont_generalize`.
all_runs %>%
  filter(type == "simplified") %>%
  inner_join(unsimplified_that_dont_generalize, by = c("log", "problem")) %>%
  mutate(generalized = (testError == 0)) %>%
  ggplot(mapping = aes(x = programSize,
                       fill = generalized,
                       colour = generalized)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~ problem, scales = "free") +
  theme_bw() +
  ggtitle("Unsimplified Programs that Didn't Generalized") +
  xlab("Post Simplification Program Size")

Density (stat = “count”) of simplified size

Not faceted

# Count-based (stat = "count") density of simplified program size over all
# simplified records, split by `generalized`, on a log10 y axis.
all_runs %>%
  filter(type == "simplified") %>%
  mutate(generalized = (testError == 0)) %>%
  ggplot(mapping = aes(x = programSize,
                       fill = generalized,
                       colour = generalized)) +
  geom_density(stat = "count", alpha = 0.5) +
  scale_y_log10() +
  theme_bw() +
  xlab("Post Simplification Program Size")

Not faceted A

# Count-based density of simplified program size (no faceting), limited to
# (log, problem) pairs listed in `unsimplified_that_generalize`.
all_runs %>%
  filter(type == "simplified") %>%
  inner_join(unsimplified_that_generalize, by = c("log", "problem")) %>%
  mutate(generalized = (testError == 0)) %>%
  ggplot(mapping = aes(x = programSize,
                       fill = generalized,
                       colour = generalized)) +
  geom_density(stat = "count", alpha = 0.5) +
  theme_bw() +
  ggtitle("Unsimplified Programs that Generalized") +
  xlab("Post Simplification Program Size")

Not faceted B

# Count-based density of simplified program size (no faceting), limited to
# (log, problem) pairs listed in `unsimplified_that_dont_generalize`.
all_runs %>%
  filter(type == "simplified") %>%
  inner_join(unsimplified_that_dont_generalize, by = c("log", "problem")) %>%
  mutate(generalized = (testError == 0)) %>%
  ggplot(mapping = aes(x = programSize,
                       fill = generalized,
                       colour = generalized)) +
  geom_density(stat = "count", alpha = 0.5) +
  theme_bw() +
  ggtitle("Unsimplified Programs that Didn't Generalized") +
  xlab("Post Simplification Program Size")

Facet where unsimplified program generalized

# Count-based density of simplified program size, one facet per problem with
# free scales, limited to (log, problem) pairs listed in
# `unsimplified_that_generalize`.
all_runs %>%
  filter(type == "simplified") %>%
  inner_join(unsimplified_that_generalize, by = c("log", "problem")) %>%
  mutate(generalized = (testError == 0)) %>%
  ggplot(mapping = aes(x = programSize,
                       fill = generalized,
                       colour = generalized)) +
  geom_density(stat = "count", alpha = 0.5) +
  facet_wrap(~ problem, scales = "free") +
  theme_bw() +
  ggtitle("Unsimplified Programs that Generalized") +
  xlab("Post Simplification Program Size")

Facet where unsimplified program DIDN’T generalize

# Count-based density of simplified program size, one facet per problem with
# free scales, limited to (log, problem) pairs listed in
# `unsimplified_that_dont_generalize`.
all_runs %>%
  filter(type == "simplified") %>%
  inner_join(unsimplified_that_dont_generalize, by = c("log", "problem")) %>%
  mutate(generalized = (testError == 0)) %>%
  ggplot(mapping = aes(x = programSize,
                       fill = generalized,
                       colour = generalized)) +
  geom_density(stat = "count", alpha = 0.5) +
  facet_wrap(~ problem, scales = "free") +
  theme_bw() +
  ggtitle("Unsimplified Programs that Didn't Generalized") +
  xlab("Post Simplification Program Size")